# Load the cluster survey data; first CSV column holds the row names.
Data <- read.csv("Final/BoT_Cluster_ENGR_Updated.csv", row.names = 1)
# Drop the first two (id) columns, keeping only the BoT survey variables.
BoT <- Data[, -c(1, 2)]
# Hierarchical clustering on Euclidean distances with complete linkage.
dcan <- dist(BoT, method = "euclidean")
hcan <- hclust(dcan, method = "complete")
# Bare dendrogram: no axes, titles, or leaf labels.
plot(hcan, axes = FALSE, ann = FALSE, main = NA, labels = FALSE, hang = 0.01)
We can see from the figure that the best choice for the total number of clusters is 3. We now compare the cluster assignments with the original cohort id.
# Cut the dendrogram into 3 clusters and cross-tabulate against cohort id.
HC_cut <- cutree(hcan, k = 3)
table(Data$cohort_id, HC_cut)
## HC_cut
## 1 2 3
## 1 28 23 5
## 6 25 25 8
## 10 36 37 13
## 11 2 10 2
## 12 17 12 3
## 15 27 21 7
## 16 15 15 1
## 17 21 21 4
## 22 60 78 19
## 28 16 26 8
## 30 17 26 3
## 35 23 27 5
## 39 21 30 9
## 40 19 20 7
## 41 26 16 1
We will save the results so we can perform the tests.
# Attach the HC cluster label to each observation.
Data_HC_Result <- BoT %>% mutate(Cluster = HC_cut)
# Per-cluster means of every BoT variable. Reuses Data_HC_Result instead of
# rebuilding it, and uses across() in place of the superseded summarise_all().
(Data_HC_Result_Summarized <- Data_HC_Result %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean)))
## # A tibble: 3 × 9
## Cluster Control SpeakUp Extraversion BT_PastGroups BT_PastPositive
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 3.64 5.22 4.99 2.69 4.00
## 2 2 3.44 3.68 4.36 2.60 3.81
## 3 3 4.83 3.41 2.72 2.61 3.61
## # … with 3 more variables: GroupPreference <dbl>, Procrastination <dbl>,
## # BT_Belongingness <dbl>
# Persist the per-observation and per-cluster HC results for later testing.
write.csv(Data_HC_Result, "Final/Data_HC_Result.csv", row.names = FALSE)
write.csv(Data_HC_Result_Summarized, "Final/Data_HC_Result_Summarized.csv", row.names = FALSE)
Let us look at the histograms for each cluster.
# One interactive dodged count plot per BoT variable, colored by HC cluster.
plt <- htmltools::tagList()
# Iterate the actual variable columns rather than a hard-coded 1:8.
hc_vars <- setdiff(colnames(Data_HC_Result), "Cluster")
for (i in seq_along(hc_vars)) {
  # as_widget() replaces the deprecated as.widget(); geom_bar() is the
  # documented replacement for geom_histogram(stat = "count").
  plt[[i]] <- as_widget(ggplotly(
    ggplot(Data_HC_Result,
           aes(x = as.factor(.data[[hc_vars[i]]]),
               fill = as.factor(Cluster), color = as.factor(Cluster))) +
      geom_bar(position = "dodge")
  ))
}
Using pairwise Wilcoxon rank-sum tests as the non-parametric alternative to the pairwise comparisons that follow a one-way ANOVA, since I believe my data is not normally distributed. Also, the original data is of ordinal type.
# pairwise.wilcox.test(Data_HC_Result[["Control"]], Data_HC_Result$Cluster, p.adjust.method = "BH")
# Pairwise Wilcoxon rank-sum tests between clusters for each BoT variable,
# with Benjamini-Hochberg adjustment for multiple comparisons.
# Iterates the variable columns by name instead of a hard-coded 1:8.
for (v in setdiff(colnames(Data_HC_Result), "Cluster")) {
  print(pairwise.wilcox.test(Data_HC_Result[[v]],
                             Data_HC_Result$Cluster,
                             p.adjust.method = "BH"))
}
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 0.092 -
## 3 2.5e-12 < 2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 <2e-16 0.04
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 5e-11 -
## 3 <2e-16 <2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 0.12 -
## 3 0.26 0.99
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 0.00085 -
## 3 1.6e-05 0.01404
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 0.64 -
## 3 0.19 0.19
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 0.3 1e-14
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 5.0e-14 -
## 3 1.3e-08 1
##
## P value adjustment method: BH
# Choose k for k-means via three diagnostics: within-cluster sum of squares
# (elbow), average silhouette width, and the gap statistic.
ggplotly(fviz_nbclust(BoT, kmeans, method = "wss"))
# Spell out "silhouette" instead of relying on partial matching of "sil".
ggplotly(fviz_nbclust(BoT, kmeans, method = "silhouette"))
fviz_nbclust(BoT, kmeans, method = "gap_stat")
We can see from the various figures that the best choice for the total number of clusters is either 2 (wss and silhouette) or 3 (gap_stat and the dendrogram from HC). For this paper, we will choose 3 clusters for consistency with HC.
# Fix the RNG so the k-means solution (25 random starts) is reproducible.
set.seed(123)
k3 <- kmeans(BoT, centers = 3, nstart = 25)
# Visualize the clusters and cross-tabulate assignments against cohort id.
fviz_cluster(k3, data = BoT, geom = "point")
table(Data$cohort_id, k3$cluster)
##
## 1 2 3
## 1 23 16 17
## 6 13 23 22
## 10 16 30 40
## 11 4 8 2
## 12 11 6 15
## 15 18 16 21
## 16 7 10 14
## 17 10 16 20
## 22 39 69 49
## 28 9 29 12
## 30 15 15 16
## 35 18 19 18
## 39 19 22 19
## 40 11 21 14
## 41 20 10 13
We will save the results so we can perform the tests.
# Attach the k-means cluster label to each observation.
Data_Kmeans_Result <- BoT %>% mutate(Cluster = k3$cluster)
# Per-cluster means of every BoT variable. Reuses Data_Kmeans_Result instead
# of rebuilding it, and uses across() in place of the superseded summarise_all().
(Data_Kmeans_Result_Summarized <- Data_Kmeans_Result %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean)))
## # A tibble: 3 × 9
## Cluster Control SpeakUp Extraversion BT_PastGroups BT_PastPositive
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 5.04 4.97 5.39 2.74 3.81
## 2 2 3.82 3.24 3.10 2.43 3.69
## 3 3 2.46 4.89 5.10 2.78 4.11
## # … with 3 more variables: GroupPreference <dbl>, Procrastination <dbl>,
## # BT_Belongingness <dbl>
# Persist the per-observation and per-cluster k-means results for later testing.
write.csv(Data_Kmeans_Result, "Final/Data_Kmeans_Result.csv", row.names = FALSE)
write.csv(Data_Kmeans_Result_Summarized, "Final/Data_Kmeans_Result_Summarized.csv", row.names = FALSE)
Let us look at the histograms for each cluster.
# One interactive dodged count plot per BoT variable, colored by k-means cluster.
plt2 <- htmltools::tagList()
# Iterate the actual variable columns rather than a hard-coded 1:8.
km_vars <- setdiff(colnames(Data_Kmeans_Result), "Cluster")
for (i in seq_along(km_vars)) {
  # as_widget() replaces the deprecated as.widget(); geom_bar() is the
  # documented replacement for geom_histogram(stat = "count").
  plt2[[i]] <- as_widget(ggplotly(
    ggplot(Data_Kmeans_Result,
           aes(x = as.factor(.data[[km_vars[i]]]),
               fill = as.factor(Cluster), color = as.factor(Cluster))) +
      geom_bar(position = "dodge")
  ))
}
Using pairwise Wilcoxon rank-sum tests as the non-parametric alternative to the pairwise comparisons that follow a one-way ANOVA, since I believe my data is not normally distributed. Also, the original data is of ordinal type.
# pairwise.wilcox.test(Data_HC_Result[["Control"]], Data_HC_Result$Cluster, p.adjust.method = "BH")
# Pairwise Wilcoxon rank-sum tests between k-means clusters for each BoT
# variable, with Benjamini-Hochberg adjustment for multiple comparisons.
# Iterates the variable columns by name instead of a hard-coded 1:8.
for (v in setdiff(colnames(Data_Kmeans_Result), "Cluster")) {
  print(pairwise.wilcox.test(Data_Kmeans_Result[[v]],
                             Data_Kmeans_Result$Cluster,
                             p.adjust.method = "BH"))
}
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 <2e-16 <2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 0.43 <2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 0.0057 <2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 3.3e-09 -
## 3 0.55 3.7e-12
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 0.12 -
## 3 9.2e-07 9.8e-12
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 0.46 -
## 3 6.1e-13 2.1e-15
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 0.00073 -
## 3 1.4e-05 0.13691
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 0.23 <2e-16
##
## P value adjustment method: BH